About

In this markdown, we will do a comparison of the transcriptomes during development of Ptychodera, Amphixous, and Purple Sea Urchin.

For this, we will use a solution of custom R functions and wrappers that we have named “comparABle”.

Load libraries

library(tidyverse)
library(stringr)
library(BiocGenerics)
library(EDASeq)
library(DESeq2)
library(tximport)
library(limma)
library(ComplexHeatmap)
library(circlize)
library(RColorBrewer)
require(data.table)
require(stringr)
require(tidytree)
require(ggtree)
require(rvcheck)
require(treeio)
require(dendextend)
require(phangorn)
require(phytools)
require(ComplexHeatmap)
require(foreach)
require(doParallel)
require(colorspace)
library(topGO)

Load functions

source("code/r_code/r_functions/sourcefolder.R")

sourceFolder(
  "code/r_code/r_functions",
  recursive = TRUE
  )

sourceFolder(
  "code/r_code/r_general",
  recursive = TRUE
  )

Load Data

We will first load the data of our three species:

# Ptychodera analyses
load("outputs/rda/deseq2.rda")
# Amphioxus reanalysis
load("outputs/rda/ajap_reanalysis.rda")

We will follow by loading all the gene family data that we need. This is the gene/gfam lookup table (extracted from the OrthoXML output file from OMA) and the respective table to transform the gene indexes into the actual gene id of every species.

gene_gfam <- read.delim2("outputs/comparative/20240404_oma/oma_omaid_gfam.tsv", header = TRUE)
colnames(gene_gfam) <- c("id","gfam")
gene_gfam$id <- gsub("\\.p[0-9]+","",gene_gfam$id)

This gene_gfam file will be used later on as proxy to compare developmental stages.

head(gene_gfam)
##           id     gfam
## 1 CAEEL00007 HOG00001
## 2 SCHMD09491 HOG00001
## 3 SCHMD28687 HOG00002
## 4 SCHMD02140 HOG00002
## 5 CAEEL00052 HOG00002
## 6 CAEEL00105 HOG00003
oma_pfla_ajap <- read.delim2("outputs/comparative/oma/1to1_orthologs/1to1_pfla_ajap.tsv", header = FALSE)[,c(2,1)]
oma_pfla_ajap$V2 <- gsub(" ","",oma_pfla_ajap$V2)
load("outputs/rda/geneage.rda")

For more functional annotation, we will use the COG functional categories and the GO terms of Ptychodera.

pfla_cogs <- read.table(
  "outputs/functional_annotation/COGs/pfla_cogs.tsv",
  col.names = c("id","cog")
)

# GO terms
pfla_id2go <- 
  readMappings(
    "outputs/functional_annotation/go_blast2go/GO_annotation.txt"
  )

Ptychodera and Amphioxus

We will first compare the transcriptomes and stage-specific clusters of Ptychodera and Amphioxus. For this, we will define a number of objects that will enter into our custom wrapper function comparABle. The most important of them being:

In addition, we will also pass it a list of gene ontologies, a COG functional category association file, and a list of gene ages that are shared between species A and B.

# Expression data
a = pfla_rna_counts
b = ajap_counts

# Samples for rowmeans_by repl
a_samples = levels(condition_x)
b_samples = unique(sub("_.$", "", colnames(b)))

# Family/orthology data
o = unique(oma_pfla_ajap)
f = gene_gfam[grep("TCONS|AENJA",gene_gfam$id),]

# Module/Cluster information
ma = data.frame(
  id = rownames(pfla_rna_dev),
  module = pfla_rna_dev$cID
)

mb = ajap_cl
colnames(mb) <- c("id","module")

# Gene Age
ga = pfla_age[,c(1,3)]
colnames(ga) = c("id","age")

# COG
cog_a <- pfla_cogs

# GOs
a_universe = rownames(vsd_allgen)
a_id2go = pfla_id2go

# Common Evo Nodes
common_evo_nodes = unique(ga$age)[!(unique(ga$age) %in% c("7_Hemich","8_Pfla"))]

Below we will run the comparABle wrapper. This can take some time.

PFLA_AJAP_COMPARISON <- comparABle(
  a_name = "P.flava",
  b_name = "A.japonica",
  a = a,
  b = b,
  o = o,
  f = f,
  ma = ma,
  mb = mb,
  ga = ga,
  gb = gb,
  cog_a = cog_a,
  cog_b = cog_b,
  a_samples = a_samples,
  b_samples = b_samples,
  highlyvariable = FALSE,
  cooc_p = 0.05,
  cooc_h = c(0.70,0.95),
  cooc_cor_method = "pearson",
  a_universe = a_universe,
  a_id2go = a_id2go,
  common_evo_nodes = common_evo_nodes,
  sep = ",\ "
)
## [1] "Tidy up data"
## [1] "Merge data"
## [1] "Correlations"
## [1] "PCA"
## [1] "Co-Occurrence"
## [1] "Common genes in Correlations"
## The top five similar pair of stages are: 16.63416 16.88313 17.12493 17.12962 17.20074 
## [1] "Subsetting count matrices"
## [1] "Model fitting and top genes"
## [1] "Common genes in Correlations (GO)"
## [1] "Starting analysis 1 of 5"
## [1] "Starting analysis 2 of 5"
## [1] "Starting analysis 3 of 5"
## [1] "Starting analysis 4 of 5"
## [1] "Starting analysis 5 of 5"
## [1] "Common genes in Correlations (age)"
## [1] "Pairwise Orthology Overlap Strategy across modules -- hypergeometric and binonmial tests"
## [1] "Getting info on genes from shared families across modules"
## [1] "Starting analysis 1 of 20"
## [1] "Starting analysis 2 of 20"
## [1] "Starting analysis 3 of 20"
## [1] "Starting analysis 4 of 20"
## [1] "Starting analysis 5 of 20"
## [1] "Starting analysis 6 of 20"
## [1] "Starting analysis 7 of 20"
## [1] "Starting analysis 8 of 20"
## [1] "Starting analysis 9 of 20"
## [1] "Starting analysis 10 of 20"
## [1] "Starting analysis 11 of 20"
## [1] "Starting analysis 12 of 20"
## [1] "Starting analysis 13 of 20"
## [1] "Starting analysis 14 of 20"
## [1] "Starting analysis 15 of 20"
## [1] "Starting analysis 16 of 20"
## [1] "Starting analysis 17 of 20"
## [1] "Starting analysis 18 of 20"
## [1] "Starting analysis 19 of 20"
## [1] "Starting analysis 20 of 20"
## [1] "Plots"
## [1] "Generating results"

After running this we can start exploring the comparisons. First we will tidy up the pairwise correlation data and add names to the rows and columns of the jensen-shannon distance matrix.

Here are the heatmaps of pairwise correlations between Ptychodera and Amphioxus.

plot_cors(PFLA_AJAP_COMPARISON$pairwise_correlations)

A quick sanity check that this is consistently observed regardles of genes by doing some bootstraping and checkin the mean observed JSD:

js_mean_pfla_ajap <- jsd_with_subsampling(
  a_o = PFLA_AJAP_COMPARISON$merged_data$a_o , 
  b_o = PFLA_AJAP_COMPARISON$merged_data$b_o, 
  n = 1000, p = 0.25
)

#relativise to min and max values
js_mean_rel <- relativise(js_mean_pfla_ajap$mean)

h3_avg_rel <- Heatmap(js_mean_rel,cluster_rows = F, cluster_columns = F, show_row_names = TRUE, name = "JSD", col = brewer.pal(10,"RdBu"))
draw(h3_avg_rel)

Here is the average expression profile of genes more expressed in the most similar stages between Ptychodera an Amphioxus, including GO terms and age enrichment:

# Common genes, highly correlated
p1 <- plot_hicor_genes(
  scatter = PFLA_AJAP_COMPARISON$high_corr_genes$pairwise_data$hicor_topgenes[[1]]$plot_topgenes,
  GO_plot = PFLA_AJAP_COMPARISON$high_corr_genes$GOs$GOplot[[1]],
  age_table = PFLA_AJAP_COMPARISON$high_corr_genes$age[[1]]$AgeperModule,
  age_enrichemnt_hm = PFLA_AJAP_COMPARISON$high_corr_genes$age[[1]]$heatmap
)
p2 <- plot_hicor_genes(
  scatter = PFLA_AJAP_COMPARISON$high_corr_genes$pairwise_data$hicor_topgenes[[2]]$plot_topgenes,
  GO_plot = PFLA_AJAP_COMPARISON$high_corr_genes$GOs$GOplot[[2]],
  age_table = PFLA_AJAP_COMPARISON$high_corr_genes$age[[2]]$AgeperModule,
  age_enrichemnt_hm = PFLA_AJAP_COMPARISON$high_corr_genes$age[[2]]$heatmap
)
p3 <- plot_hicor_genes(
  scatter = PFLA_AJAP_COMPARISON$high_corr_genes$pairwise_data$hicor_topgenes[[3]]$plot_topgenes,
  GO_plot = PFLA_AJAP_COMPARISON$high_corr_genes$GOs$GOplot[[3]],
  age_table = PFLA_AJAP_COMPARISON$high_corr_genes$age[[3]]$AgeperModule,
  age_enrichemnt_hm = PFLA_AJAP_COMPARISON$high_corr_genes$age[[3]]$heatmap
)
p4 <- plot_hicor_genes(
  scatter = PFLA_AJAP_COMPARISON$high_corr_genes$pairwise_data$hicor_topgenes[[4]]$plot_topgenes,
  GO_plot = PFLA_AJAP_COMPARISON$high_corr_genes$GOs$GOplot[[4]],
  age_table = PFLA_AJAP_COMPARISON$high_corr_genes$age[[4]]$AgeperModule,
  age_enrichemnt_hm = PFLA_AJAP_COMPARISON$high_corr_genes$age[[4]]$heatmap
)
p5 <- plot_hicor_genes(
  scatter = PFLA_AJAP_COMPARISON$high_corr_genes$pairwise_data$hicor_topgenes[[5]]$plot_topgenes,
  GO_plot = PFLA_AJAP_COMPARISON$high_corr_genes$GOs$GOplot[[5]],
  age_table = PFLA_AJAP_COMPARISON$high_corr_genes$age[[5]]$AgeperModule,
  age_enrichemnt_hm = PFLA_AJAP_COMPARISON$high_corr_genes$age[[5]]$heatmap
)
plot_grid(p1,p2,p3,p4,p5,ncol = 1)

Here is the co-occurrence analysis showcasing the Amphioxus stages most similar to Ptychodera gastrulation are the stages corresponding to neurulation.

Heatmap(
  name = "co-occurrence",
  PFLA_AJAP_COMPARISON$coocurrence_analysis$cooccurrence,
  cluster_rows = PFLA_AJAP_COMPARISON$coocurrence_analysis$tree,
  cluster_columns = PFLA_AJAP_COMPARISON$coocurrence_analysis$tree,
  col = sequential_hcl(10,"YlOrRd", rev = TRUE)
)

Heatmap(
  name = "co-occurrence",
  PFLA_AJAP_COMPARISON$coocurrence_analysis$cooccurrence[1:16,17:32],
  cluster_rows = F,
  cluster_columns = F,
  col = sequential_hcl(10,"YlOrRd", rev = TRUE)
)

Here the orthology overlap strategy showcasing similarities at the gene family usage between hemichordate larval stages and amphioxus post-gastrulation development

pf_avg_hm+
PFLA_AJAP_COMPARISON$plots$orthology_overlap_binomial_hm+
PFLA_AJAP_COMPARISON$plots$orthology_overlap_hypgeom_hm

Here is a quick overview of the stats of the hypergeometric and binomial tests for pairwise comparisons:

summary(PFLA_AJAP_COMPARISON$orthology_overlap_modules$pairwise_module_comparison$stats)
##    module_a           module_b         success_in_samples sample_size       
##  Length:351         Length:351         Length:351         Length:351        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  success_in_pop       gene_POP          hypgeom_pval      hypgeom_log      
##  Length:351         Length:351         Min.   :0.00000   Min.   :  0.0000  
##  Class :character   Class :character   1st Qu.:0.01841   1st Qu.:  0.1405  
##  Mode  :character   Mode  :character   Median :0.36853   Median :  0.9982  
##                                        Mean   :0.44305   Mean   :  4.0118  
##                                        3rd Qu.:0.86893   3rd Qu.:  3.9955  
##                                        Max.   :1.00000   Max.   :130.1029  
##    binom_pval       gfams_common       gfams_excl_a       gfams_excl_b      
##  Min.   :0.000000   Length:351         Length:351         Length:351        
##  1st Qu.:0.003473   Class :character   Class :character   Class :character  
##  Median :0.131600   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :0.283764                                                           
##  3rd Qu.:0.515037                                                           
##  Max.   :1.000000

These is a quick overview at the number of genes in enriched gene families between pairs of comparisons:

# Group by module and summarize
commongenes <- PFLA_AJAP_COMPARISON$orthology_overlap_modules$
  genes_in_common_fams$commonfams$table_a_common %>%
  dplyr::group_by(module) %>%
  dplyr::summarize(
    numgenes = n(),
    genes = paste(ifelse(n() > 3, paste0(paste(head(id, 3), collapse = ",")," ..."), paste(id, collapse = ", ")), collapse = ", ")
  )

# Rename columns
colnames(commongenes) <- c("module", "numgenes", "genes")

commongenes
## # A tibble: 20 × 3
##    module numgenes genes                                           
##    <chr>     <int> <chr>                                           
##  1 01__2       170 TCONS_00031532,TCONS_00031878,TCONS_00032076 ...
##  2 02__2       166 TCONS_00031600,TCONS_00031699,TCONS_00022997 ...
##  3 02__4       133 TCONS_00022982,TCONS_00024592,TCONS_00024599 ...
##  4 04__16       38 TCONS_00024919,TCONS_00024987,TCONS_00023673 ...
##  5 06__2       147 TCONS_00038738,TCONS_00031602,TCONS_00031603 ...
##  6 06__4       218 TCONS_00038727,TCONS_00038493,TCONS_00038499 ...
##  7 06__6       101 TCONS_00025392,TCONS_00025508,TCONS_00024325 ...
##  8 07__4        98 TCONS_00039102,TCONS_00023445,TCONS_00023479 ...
##  9 07__6        54 TCONS_00025457,TCONS_00054882,TCONS_00055219 ...
## 10 14__10       47 TCONS_00032341,TCONS_00023837,TCONS_00055230 ...
## 11 14__12      168 TCONS_00039151,TCONS_00032287,TCONS_00023217 ...
## 12 15__10       46 TCONS_00038662,TCONS_00024041,TCONS_00025570 ...
## 13 15__11       33 TCONS_00024844,TCONS_00024862,TCONS_00025516 ...
## 14 16__11       51 TCONS_00023080,TCONS_00055791,TCONS_00009850 ...
## 15 17__12      156 TCONS_00039109,TCONS_00031633,TCONS_00032297 ...
## 16 17__15      137 TCONS_00039109,TCONS_00031633,TCONS_00031861 ...
## 17 19__15      110 TCONS_00038643,TCONS_00023063,TCONS_00024996 ...
## 18 19__16       70 TCONS_00023063,TCONS_00024996,TCONS_00024091 ...
## 19 22__15       92 TCONS_00032480,TCONS_00024615,TCONS_00024668 ...
## 20 22__16       62 TCONS_00024155,TCONS_00025613,TCONS_00054970 ...

And here the combination of gene age and functional categories enrichment:

PFLA_AJAP_COMPARISON$orthology_overlap_modules$
  genes_in_common_fams$commonfams$
    age_a_common$heatmap +
PFLA_AJAP_COMPARISON$orthology_overlap_modules$
  genes_in_common_fams$commonfams$
    cog_a_comon$heatmap

And here the gene ontology enrichment of enriched gene families between pairs of modules.

# Add a new column with the name of the original data frame
keygenes_commonfams_GO <- 
  bind_rows(
    PFLA_AJAP_COMPARISON$orthology_overlap_modules$
      genes_in_common_fams$commonfams$go_a_common$GOtable, 
    .id = "pair_gfams"
    )

# Rename the new column
colnames(keygenes_commonfams_GO)[1] <- "pair_gfams"

keygenes_commonfams_GO$logp <- 
  -log10(as.numeric(keygenes_commonfams_GO$classicFisher))

keygenes_commonfams_GO$logp[is.na(keygenes_commonfams_GO$logp)] <- 
  max(keygenes_commonfams_GO$logp, na.rm = T)

keygenes_commonfams_GO$obsexp <- 
  as.numeric(keygenes_commonfams_GO$Significant/keygenes_commonfams_GO$Expected)

keygenes_commonfams_GO <- 
  keygenes_commonfams_GO[
    order(
      keygenes_commonfams_GO$pair_gfams,
      keygenes_commonfams_GO$Term
      ),
    ]

keygenes_commonfams_GO$Term <- 
  factor(
    keygenes_commonfams_GO$Term, 
    levels = unique(keygenes_commonfams_GO$Term)
    )

The GO plot here:

gg <- keygenes_commonfams_GO  %>% 
  ggplot(
    aes(x=pair_gfams, y = Term, size = Significant, color = logp)
    ) + geom_point() +
  theme_bw() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 8 )
    )+ 
  theme(
    axis.text.y.left = element_text(hjust = 1, size = 8 ),
    legend.title=element_text(size=8 ),
    legend.text=element_text(size=8 )
    )+ 
  scale_size_continuous(
    name = "Genes affected",
    range = c(2,6)
    ) + scale_color_viridis("-log10(P-value)") +
  xlab("")+ylab("")

gg

Save the data

We will save these two objects for further analysis and reference:

save(
  PFLA_AJAP_COMPARISON,
  file = "outputs/rda/species_comparison_pfla_ajap.rda"
  )

Save the plots

pf_aj_js <- 
  Heatmap(
    js_mean_rel,
    cluster_rows = F,
    cluster_columns = F, 
    show_row_names = TRUE, 
    name = "JSD", 
    col = brewer.pal(10,"RdBu"),
    left_annotation = devstages_ha_rows(),
    top_annotation = quick_ha(colnames(js_mean_rel),"Heat", rev = TRUE)
    )
pdf(
  file = "graphics/pfla_ajap_jsd.pdf",
  height = 7,
  width = 7
)
draw(pf_aj_js)
dev.off()
## png 
##   2
pdf(
  file = "graphics/pfla_ajap_hypgeom.pdf",
  height = 5,
  width = 5
  )
draw(PFLA_AJAP_COMPARISON$plots$orthology_overlap_hypgeom_hm)
dev.off()
## png 
##   2
pdf(
  file = "graphics/pfla_ajap_cogs.pdf",
  height = 7,
  width = 5
)
draw(PFLA_AJAP_COMPARISON$orthology_overlap_modules$
  genes_in_common_fams$commonfams$
    cog_a_comon$heatmap)
dev.off()
## png 
##   2
save_plot(
  file = "graphics/pfla_ajap_hicor.pdf",
  base_height = 10,
  base_width = 10,
  plot_grid(
    p4,
    p5,
    ncol = 1
    )
  )